library(tidyverse)
library(sf)
library(viridis)
library(lubridate)
library(tsibble)
library(fable)
library(fabletools)
library(feasts)
library(janitor)
library(rnaturalearth)
library(rnaturalearthdata)
library(ggridges)
library(knitr)
# Read the logistics dataset and standardise the key columns:
# GPS coordinates coerced to numeric, timestamp parsed to POSIXct,
# and a non-negative delay (hours) derived from the delivery deviation.
data <- read_csv("dynamic_supply_chain_logistics_dataset.csv", show_col_types = FALSE) %>%
  rename(lon = vehicle_gps_longitude, lat = vehicle_gps_latitude) %>%
  mutate(
    lon = suppressWarnings(as.numeric(lon)),
    lat = suppressWarnings(as.numeric(lat)),
    timestamp = lubridate::ymd_hms(timestamp, quiet = TRUE),
    # early deliveries (negative deviation) are clamped to zero delay
    delay_hours = pmax(suppressWarnings(as.numeric(delivery_time_deviation)), 0)
  ) %>%
  filter(!is.na(lon), !is.na(lat), !is.na(timestamp))
Forecast Reconciliation for SCM
1 Libraries and Loading Data
2 Visualisations
We overlay delay probabilities on a map of the contiguous US, and then show a zoomed-in view of the Southern California region.
# Base map: contiguous US states (drop Alaska, Hawaii, Puerto Rico), WGS84
us <- ne_states(country = "United States of America",
                returnclass = "sf")
us_conus <- subset(us, !name %in% c("Alaska", "Hawaii", "Puerto Rico")) |>
  st_transform(4326)
pts <- st_as_sf(data, coords = c("lon", "lat"), crs = 4326, remove = FALSE)
# Keep only the points that fall inside the CONUS polygons
inside <- lengths(st_within(pts, us_conus)) > 0
pts_in <- pts[inside, , drop = FALSE]
# Thin to at most 80k points so the scatter stays legible
set.seed(1)
if (nrow(pts_in) > 80000) {
  pts_plot <- pts_in[sample(nrow(pts_in), 80000), ]
} else {
  pts_plot <- pts_in
}
us_outline <- st_cast(us_conus, "MULTILINESTRING")
# Colour by delay probability when that column exists, otherwise flat blue
color_var <- "delay_probability"
has_color <- color_var %in% names(pts_plot)
point_layer <- if (has_color) {
  geom_point(data = st_drop_geometry(pts_plot),
             aes(x = lon, y = lat, color = .data[[color_var]]),
             alpha = 0.35, size = 0.25)
} else {
  geom_point(data = st_drop_geometry(pts_plot),
             aes(x = lon, y = lat),
             color = "steelblue", alpha = 0.35, size = 0.25)
}
# NULL when has_color is FALSE; ggplot silently ignores "+ NULL"
color_scale <- if (has_color) scale_color_viridis_c(name = color_var, limits = c(0, 1))
bb <- st_bbox(us_conus)
p <- ggplot() +
  geom_sf(data = us_conus, fill = "grey98", color = "grey90", linewidth = 0.2) +
  point_layer +
  geom_sf(data = us_outline, fill = NA, color = "grey20", linewidth = 0.3) +
  coord_sf(xlim = bb[c("xmin", "xmax")],
           ylim = bb[c("ymin", "ymax")],
           expand = FALSE, clip = "on") +
  color_scale +
  labs(title = "Delay Probabilities: All States",
       x = "Longitude", y = "Latitude") +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank())
print(p)
2.1 Southern California
# California polygon in WGS84
ca <- rnaturalearth::ne_states(country = "United States of America",
                               returnclass = "sf") |>
  subset(name == "California") |>
  st_transform(4326)
# All rows as sf points
pts_all <- st_as_sf(data, coords = c("lon", "lat"), crs = 4326, remove = FALSE)
# Restrict to points inside California
pts_ca <- st_intersection(pts_all, ca)
# Crop to a Southern California bounding box
# (st_bbox expects the order xmin, ymin, xmax, ymax)
socal_bbox <- c(xmin = -121, ymin = 32, xmax = -114, ymax = 36)
bbox_poly <- st_as_sfc(st_bbox(socal_bbox, crs = sf::st_crs(4326)))
pts_socal <- st_intersection(pts_ca, bbox_poly)
cat("CA points:", nrow(pts_ca), " | SoCal-in-CA points:", nrow(pts_socal), "\n")
CA points: 613 | SoCal-in-CA points: 408
# Bubble map of SoCal delay probabilities over the California polygon.
# Both colour and size encode delay_probability; the size legend is hidden
# so only one legend appears.
ca_outline <- sf::st_cast(ca, "MULTILINESTRING")
p_socal <- ggplot() +
  geom_sf(data = ca, fill = "grey98", color = "grey85", linewidth = 0.2) +
  # bubble layer
  geom_sf(
    data = pts_socal,
    aes(color = delay_probability, size = delay_probability),
    alpha = 0.55
  ) +
  geom_sf(data = ca_outline, fill = NA, color = "grey20", linewidth = 0.4) +
  # fix: dropped a redundant nested c() around socal_bbox["ymax"]
  coord_sf(xlim = c(socal_bbox["xmin"], socal_bbox["xmax"]),
           ylim = c(socal_bbox["ymin"], socal_bbox["ymax"]),
           expand = FALSE, clip = "on") +
  scale_color_viridis_c(name = "delay_probability", limits = c(0, 1)) +
  scale_size_continuous(range = c(2, 5), name = "delay_probability", guide = "none") +
  labs(title = "Delay Probabilities: Southern California",
       x = "Longitude", y = "Latitude") +
  theme_minimal(base_size = 12) +
  theme(panel.grid = element_blank())
print(p_socal)
3 Variables
For reconciliation, we need an additive target variable. Thus, we define a new target based on the variable \(D\) (Delivery Time Deviation) as the total delay time, given by \[T=\sum_i \max\{D_i, 0\},\quad D_i=\text{delivery time deviation}.\] Accordingly, we add the delay_time variable.
data = data %>% mutate(delay_time = pmax(delivery_time_deviation, 0))We also add a State variable for the two-level hierarchy on State/cell_id.
# 1) Re-standardise fields and recompute the additive ingredient (row-level).
data <- data %>%
  mutate(
    lon = suppressWarnings(as.numeric(lon)),
    lat = suppressWarnings(as.numeric(lat)),
    timestamp = ymd_hms(timestamp, quiet = TRUE),
    # fix: suppressWarnings added for consistency with the initial load step,
    # so non-numeric deviations become NA silently rather than warning
    delay_hours = pmax(suppressWarnings(as.numeric(delivery_time_deviation)), 0)
  ) %>%
  filter(!is.na(lon), !is.na(lat), !is.na(timestamp))
# 2) Attach US state via spatial join (two-level hierarchy: TOTAL -> state -> cell_id)
us <- ne_states(country = "United States of America", returnclass = "sf")
us_conus <- subset(us, !name %in% c("Alaska", "Hawaii", "Puerto Rico")) %>% st_transform(4326)
pts <- st_as_sf(data, coords = c("lon", "lat"), crs = 4326, remove = FALSE)
data <- st_join(pts, us_conus["name"], left = TRUE) %>%
  st_drop_geometry() %>%
  rename(state = name)
# Drop rows whose coordinates fall outside the CONUS polygons
data <- filter(data, !is.na(state))
3.1 EDA: Delay Time vs Cargo Condition Status
# Binary cargo label from the continuous condition score (<= 0.5 -> "Poor")
data <- data %>%
  mutate(cargo = if_else(as.numeric(cargo_condition_status) <= 0.5, "Poor", "Good"))
# Boxplot of delay time by cargo condition
ggplot(data, aes(x = cargo, y = delay_time, fill = cargo)) +
  geom_boxplot(outlier.alpha = 0.15, width = 0.65) +
  labs(title = "Delay time by cargo condition",
       x = NULL, y = "Delay time (hours)") +
  theme_bw()
# Ridgeline densities of delay time for each cargo condition
ggplot(data, aes(x = delay_time, y = cargo, fill = cargo)) +
  geom_density_ridges(alpha = 0.8, scale = 1.1, rel_min_height = 0.001, color = "white") +
  scale_fill_viridis_d(guide = "none") +
  labs(title = "Distribution of delay time by cargo condition",
       x = "Delay time (hours)", y = NULL) +
  theme_bw()
4 Forecast
We first generate the hierarchical time-series data associated with the dataset. We also aggregate the data to monthly totals; we tried hourly (the default) and daily granularity, but neither produced good results.
# Add calendar month and the cargo label; keep only rows with a known state
data <- data %>%
  mutate(
    Month = yearmonth(timestamp),
    cargo = if_else(as.numeric(cargo_condition_status) <= 0.5, "Poor", "Good")
  ) %>%
  filter(!is.na(state))
# Collapse to one row per Month x state x cargo (sum of delay hours per month)
monthly <- data %>%
  group_by(Month, state, cargo) %>%
  summarise(total_delay_hours = sum(delay_hours, na.rm = TRUE), .groups = "drop")
# Complete panel: fill calendar gaps with zeros, then build the
# TOTAL -> state -> cargo aggregation structure needed for reconciliation
ts_m <- monthly %>%
  as_tsibble(index = Month, key = c(state, cargo)) %>%
  fill_gaps(.full = TRUE) %>%
  mutate(total_delay_hours = coalesce(total_delay_hours, 0)) %>%
  aggregate_key(state / cargo, total_delay_hours = sum(total_delay_hours))
4.1 Train-Test Split
# Hold out the final 12 months as the test window
n_test <- 12
# yearmonth arithmetic: subtracting an integer moves back that many months
cutoff <- max(ts_m$Month, na.rm = TRUE) - n_test
train_data <- ts_m %>% filter(Month <= cutoff)
test_data <- ts_m %>% filter(Month > cutoff)
When we try to fit an ETS model, we generally get NULL models for a lot of these entries, so we counteract this by doing the specified ETS model for the non-NULL models, and set the rest as a TSLM model.
- ETS (additive) captures level/trend/season for non-negative flows and adapts well when there’s actual signal.
- TSLM(~1) (intercept-only) is a safe fallback for all-zero or ultra-sparse series; it yields a valid (typically zero) forecast instead of a NULL model, keeping the hierarchy intact for reconciliation.
# Fit additive ETS to every series; degenerate series come back as NULL models
fit <- train_data %>%
  model(ETS = ETS(total_delay_hours ~ error("A") + season("A")))
# Fallback mable: intercept-only TSLM for every series. The column is kept
# named "ETS" so cells can be swapped into `fit` one-for-one below.
fallback <- train_data %>% model(ETS = TSLM(total_delay_hours ~ 1))
# Replace the NULL ETS fits in place with the TSLM fallback
null_idx <- is_null_model(fit$ETS)
fit$ETS[null_idx] <- fallback$ETS[null_idx]
4.2 Reconciliation
# Reconcile the base ETS forecasts three ways (bottom-up, MinT-OLS,
# MinT structural WLS), then forecast over the held-out horizon
recon_fc <- fit %>%
  reconcile(
    BU = bottom_up(ETS),
    OLS = min_trace(ETS, "ols"),
    WLS = min_trace(ETS, "wls_struct")
  ) %>%
  forecast(h = n_test)
4.2.1 Diagnostics
# Keep only the reconciled point forecasts (drop the unreconciled base ETS)
rt <- recon_fc %>%
  as_tibble() %>%
  filter(.model %in% c("BU", "OLS", "WLS"))
# Coherence check 1: the TOTAL series vs the sum over states, per month
totals <- rt %>%
  filter(is_aggregated(state), is_aggregated(cargo)) %>%
  select(.model, Month, total = .mean)
sum_states <- rt %>%
  filter(!is_aggregated(state), is_aggregated(cargo)) %>%
  group_by(.model, Month) %>%
  summarise(sum_states = sum(.mean, na.rm = TRUE), .groups = "drop")
# Max absolute discrepancy per method; should be 0 after reconciliation
left_join(totals, sum_states, by = c(".model", "Month")) %>%
  mutate(diff = round(total - sum_states, 10)) %>%
  group_by(.model) %>%
  summarise(max_abs_diff = max(abs(diff), na.rm = TRUE), .groups = "drop")
# A tibble: 3 × 2
  .model max_abs_diff
  <chr>         <dbl>
1 BU                0
2 OLS               0
3 WLS               0
# Coherence check 2: each state's total vs the sum of its cargo children
parents <- rt %>%
  filter(!is_aggregated(state), is_aggregated(cargo)) %>%
  select(.model, Month, state, parent = .mean)
children <- rt %>%
  filter(!is_aggregated(state), !is_aggregated(cargo)) %>%
  group_by(.model, Month, state) %>%
  summarise(sum_children = sum(.mean, na.rm = TRUE), .groups = "drop")
# Max absolute discrepancy per method; should be 0 after reconciliation
left_join(parents, children, by = c(".model", "Month", "state")) %>%
  mutate(diff = round(parent - sum_children, 10)) %>%
  group_by(.model) %>%
  summarise(max_abs_diff = max(abs(diff), na.rm = TRUE), .groups = "drop")
# A tibble: 3 × 2
  .model max_abs_diff
  <chr>         <dbl>
1 BU                0
2 OLS               0
3 WLS               0
5 Comparison
# Accuracy on the held-out window, averaged across all series per model,
# sorted so the best (lowest RMSE) method appears first
acc <- accuracy(recon_fc, test_data)
res <- acc %>%
  group_by(.model) %>%
  summarise(MAE = mean(MAE, na.rm = TRUE),
            RMSE = mean(RMSE, na.rm = TRUE)) %>%
  arrange(RMSE)
kable(res, align = "ccc")
| .model | MAE | RMSE |
|---|---|---|
| OLS | 13.67204 | 17.13070 |
| WLS | 13.72854 | 17.25368 |
| ETS | 13.86745 | 17.37866 |
| BU | 13.99175 | 17.61708 |
It seems that OLS is the best-performing model.
5.1 Plot
# History: California state total (cargo level aggregated away)
ca_hist <- ts_m %>%
  filter(state == "California", is_aggregated(cargo))
# Reconciled forecasts for that same series
recon_ca <- recon_fc %>%
  filter(state == "California", is_aggregated(cargo))
# Overlay point forecasts (no prediction intervals) on the faded history
autoplot(recon_ca, level = NULL) +
  autolayer(ca_hist, total_delay_hours, alpha = 0.3) +
  labs(
    title = "Reconciled forecasts by method – California (state total)",
    y = "Total delay-hours (monthly)"
  ) +
  theme_bw()
Overall seems like even with reconciliation, we don’t get a more accurate model. This could be attributed to the input variable not being very “significant” in difference.